library(tidyverse)
library(data.table)
library(ggplot2)
library(readxl)
library(cluster)
library(plotly)
library(fpc)
# Set seed
set.seed(12345)
# Set working directory
setwd("~/Desktop/Fall 2019 Academics/HS 256_Healthcare data analytics and Data Mining/HW4")
# inpatient discharge data,
df_ip <- fread("VTINP16_upd.TXT")
# Revenue Code data
df_rev_code <- fread("VTREVCODE16.TXT") %>%
# exclude the low dollar value services (less than $100) by dropping the REVCHRGS <100
# Note that with this filter 7 unique PCCR are removed hence we only have 52 PCCRs
# in the proceeding analysis instead of the original 57
filter(REVCHRGS >= 100)
# DRG names: These can be found in FILE_LAYOUT_and_CODES -"MSDRG 2007 forward" tab
DRG_Names <- read_excel("DRG_PCCR_Names.xlsx", sheet = "DRG")
# PCCR names: These can be found in REVCODE_FILE_LAYOUT_and_CODES -"PCCR" tab
PCCR_Names <- read_excel("DRG_PCCR_Names.xlsx", sheet = "PCCR") %>%
mutate(PCCR = as.numeric(PCCR)) # Data Preparation --------------------------------------------------------
df_ip_cluster <- df_ip %>%
# Filter your hospital admissions to only important DRGs between 20 and 977:
# With this filter we lose 15 uniques DRGs
filter(between(DRG, 20, 977)) %>%
# Link your filtered Inpatient data to the Revenue Code file using the UNIQ variable
right_join(df_rev_code, by = c("UNIQ" = "Uniq")) %>%
# Now sum all the charges by the PCCR category
group_by(UNIQ,DRG,PCCR) %>%
summarise(sum_charges = sum(REVCHRGS)) %>%
# cross tabulate your list of selected DRGs (in the row) and the mean value of the PCCRs,
# as cell value
group_by(DRG, PCCR) %>%
summarise(avg_cost = round(mean(sum_charges),2)) %>%
filter(!is.na(avg_cost)) %>%
# Cross tab
pivot_wider(names_from = PCCR, values_from = avg_cost)
# Cross tab dimensions
print(dim(df_ip_cluster))## [1] 703 59
# NAs to zero
df_ip_cluster[is.na(df_ip_cluster)] <- 0
df_ip_cluster <- df_ip_cluster %>%
# select PCCR 3700 Operating Room & PCCR 4000 Anesthesiology
mutate(Operating_Room = as.numeric(`3700`),
Anesthesiology = as.numeric(`4000`)) %>%
# Create a new cost category by combining the PCCR 3700 Operating Room + PCCR 4000 Anesthesiology
mutate(PCCR_OR_and_Anesth_Costs = Operating_Room + Anesthesiology) %>%
# plug in the DRG names to your cross-tabulated file
left_join(DRG_Names, by = c("DRG" = "DRG")) %>%
ungroup() %>%
select(DRG_Name, Operating_Room, Anesthesiology,PCCR_OR_and_Anesth_Costs, everything())
#
head(df_ip_cluster, n=10)Results with three clusters
# Cluster Analysis --------------------------------------------------------
# k_means clustering
df_ip_cluster_kmeans <- df_ip_cluster %>%
select(PCCR_OR_and_Anesth_Costs)
# k = 2
k_means_2 <- kmeans(df_ip_cluster_kmeans, 2)
CHIndex_2 <- round(calinhara(df_ip_cluster_kmeans,k_means_2$cluster),3)
wss_2 <- round(k_means_2$tot.withinss,3)
# k = 3
k_means_3 <- kmeans(df_ip_cluster_kmeans, 3)
CHIndex_3 <- round(calinhara(df_ip_cluster_kmeans,k_means_3$cluster),3)
wss_3 <- round(k_means_3$tot.withinss,3)
# k = 4
k_means_4 <- kmeans(df_ip_cluster_kmeans, 4)
CHIndex_4 <- round(calinhara(df_ip_cluster_kmeans,k_means_4$cluster),3)
wss_4 <- round(k_means_4$tot.withinss,3)
# k = 5
k_means_5 <- kmeans(df_ip_cluster_kmeans, 5)
CHIndex_5 <- round(calinhara(df_ip_cluster_kmeans,k_means_5$cluster),3)
wss_5 <- round(k_means_5$tot.withinss,3)
# k_means summary table
k <- as.character(c(2,3,4,5))
CHIndex <- c(CHIndex_2, CHIndex_3, CHIndex_4, CHIndex_5)
wss <- c(wss_2, wss_3, wss_4, wss_5)
k_means_sum <- as.data.frame(cbind(k, CHIndex,wss))
# k_means = 3: create dataframe for cluster plot:
df_ip_3_clusters <- df_ip_cluster_kmeans %>%
# obtain clusters
cbind(k_means_3$cluster) %>%
# obtain DRG names
cbind(df_ip_cluster$DRG_Name) %>%
# renane variables
mutate(cluster = as.character(`k_means_3$cluster`),
DRG_Name =`df_ip_cluster$DRG_Name`) %>%
left_join(DRG_Names, by = c("DRG_Name" = "DRG_Name")) %>%
# Select only relevant columns
select(`DRG`, DRG_Name, PCCR_OR_and_Anesth_Costs,cluster)
# Note that DRG 274 is missing in the FILE_LAYOUT_AND_CODES.xlsf file although
# it is found in the Inpatient file.We inpute the value for this DRG based on online resources
# df_ip_3_clusters$DRG_Name[is.na(df_ip_3_clusters$DRG_Name)] <- "PERCUTANEOUS INTRACARDIAC PROCEDURES WITHOUT MCC"
head(df_ip_3_clusters, n=10) plot_1a <- ggplot(df_ip_3_clusters, aes(x=DRG, y= PCCR_OR_and_Anesth_Costs,
color = cluster)) +
geom_point(aes(text=DRG_Name)) +
ggtitle("K-means Clustering: Inpatient DRGs") +
xlab("DRG Code") +
ylab("Average Cost-Operating Room and Anesthesiology ($)")+
theme(
legend.position = "right",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "grey"))
plot_1b <- ggplotly(plot_1a)
# Plot 2
k_means_sum_plots <- k_means_sum %>%
mutate(k = as.numeric(as.character(k)),
CHIndex = as.numeric(as.character(CHIndex)),
wss = as.numeric(as.character(wss)))
plot_2 <- ggplot(data = k_means_sum_plots, aes(x=k, y = wss)) +
geom_line(linetype = 2, colour = "blue", size=0.4 ) +
geom_point(shape=18, size =3, color ="blue")+
xlab("K") +
ylab("Total Within cluster Sum of Squares by cluster (TWSS)") +
ggtitle("Scree plot") +
theme(
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "grey"))
# Plot 3
plot_3 <- ggplot(data = k_means_sum_plots, aes(x=k, y =CHIndex )) +
geom_line(linetype = 2, colour = "blue", size=0.4 ) +
geom_point(shape=18, size =3, color ="blue")+
xlab("K") +
ylab("Calinski-Harabasz F-statistics") +
ggtitle("Optimal number of clusters") +
theme(
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "grey"))
plot_1aplot_1bplot_2plot_3